options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages("corrplot")
##
## The downloaded binary packages are in
## /var/folders/fh/hhgmlzcn0p90_g2xb9ctfx680000gn/T//RtmpvhMj4t/downloaded_packages
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.95 loaded
library(reshape2)
library(RColorBrewer)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(naniar)
library(missForest)
#install.packages(c("ggplot2", "reshape2"))
library(corrplot)
patients <- read.csv("/Users/arnenyecknyeck/Desktop/Statistical-Inference-Package/patients_dirty_data.csv")
head(patients)
summary(patients)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI Pedigree Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Diagnosis
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
colnames(patients)
## [1] "Pregnancies" "Glucose" "BloodPressure" "SkinThickness"
## [5] "Insulin" "BMI" "Pedigree" "Age"
## [9] "Diagnosis"
str(patients)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure: int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness: int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ Pedigree : num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Diagnosis : int 1 0 1 0 1 0 1 0 1 1 ...
colSums(is.na(patients))
## Pregnancies Glucose BloodPressure SkinThickness Insulin
## 0 0 0 0 0
## BMI Pedigree Age Diagnosis
## 0 0 0 0
Positive <- subset(patients, Diagnosis == 1)
head(Positive, 5)
ggplot(patients, aes(x = factor(Diagnosis))) +
geom_bar(fill = "skyblue", color = "black") +
labs(
title = "Count of Gestational Diabetes Diagnoses",
x = "Diagnosis (0 = No, 1 = Yes)",
y = "Count"
) +
theme_minimal()
ggplot(patients, aes(x = BloodPressure, fill = ..count..)) +
geom_histogram(binwidth = 10, color = "black") +
scale_fill_viridis_c() +
labs(title = "Histogram of Blood Pressure",
x = "Blood Pressure",
y = "Count") +
theme_minimal()
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
We can say that blood pressure is skewed left by looking at the graph.
ggplot(patients, aes(x = SkinThickness, fill = ..count..)) +
geom_histogram(binwidth = 10, color = "black") +
scale_fill_viridis_c() +
labs(title = "Skin Thickness Histogram",
x = "Skin Thickness",
y = "Count") +
theme_minimal()
We can say that skin thickness is skewed left by looking at the graph.
ggplot(patients, aes(x = Insulin, fill = ..count..)) +
geom_histogram(binwidth = 10, color = "black") +
scale_fill_viridis_c() +
labs(title = "Insulin Histogram",
x = "Insulin",
y = "Count") +
theme_minimal()
We can say that Insulin is skewed left by looking at the graph.
ggplot(patients, aes(x = BMI, fill = ..count..)) +
geom_histogram(binwidth = 2, color = "black") +
scale_fill_viridis_c() +
labs(title = "Histogram of BMI",
x = "BMI",
y = "Count") +
theme_minimal()
There is an observed concentration between 20-40
ggplot(patients, aes(x = factor(Diagnosis), y = Glucose, fill = factor(Diagnosis))) +
geom_boxplot(color = "black") +
scale_fill_manual(values = c("0" = "#56B4E9", "1" = "#E69F00"),
labels = c("No GDM", "GDM")) +
labs(
title = "Glucose Levels by Diagnosis",
x = "Gestational Diabetes Diagnosis",
y = "Glucose (mg/dL)",
fill = "Diagnosis"
) +
theme_minimal()
Patients diagnosed with GDM have higher Glucose levels.
ggplot(Positive, aes(x = BloodPressure, y = Glucose)) +
geom_point(color = "blue") +
labs(title = "BloodPressure & Glucose",
x = "BloodPressure",
y = "Glucose") +
theme_minimal()
ggplot(patients, aes(x = Age, y = Glucose, color = as.factor(Diagnosis))) +
geom_point(size = 2, alpha = 0.7) +
scale_color_manual(values = c("0" = "blue", "1" = "red"),
name = "Diagnosis",
labels = c("Negative", "Positive")) +
labs(title = "Glucose vs Age Colored by Diagnosis",
x = "Age",
y = "Glucose") +
theme_minimal()
ggplot(patients, aes(x = Age, y = BloodPressure, color = as.factor(Diagnosis))) +
geom_point(size = 2, alpha = 0.7) +
scale_color_manual(values = c("0" = "blue", "1" = "red"),
name = "Diagnosis",
labels = c("Negative", "Positive")) +
labs(title = "BloodPressure vs Age Colored by Diagnosis",
x = "Age",
y = "BloodPressure") +
theme_minimal()
ggplot(patients, aes(x = Age, y = SkinThickness, color = as.factor(Diagnosis))) +
geom_point(size = 2, alpha = 0.7) +
scale_color_manual(values = c("0" = "blue", "1" = "red"),
name = "Diagnosis",
labels = c("Negative", "Positive")) +
labs(title = "SkinThickness vs Age Colored by Diagnosis",
x = "Age",
y = "SkinThickness") +
theme_minimal()
ggplot(patients, aes(x = Age, y = Insulin, color = as.factor(Diagnosis))) +
geom_point(size = 2, alpha = 0.7) +
scale_color_manual(values = c("0" = "blue", "1" = "red"),
name = "Diagnosis",
labels = c("Negative", "Positive")) +
labs(title = "Insulin vs Age Colored by Diagnosis",
x = "Age",
y = "Insulin") +
theme_minimal()
ggplot(patients, aes(x = Age, y = BMI, color = as.factor(Diagnosis))) +
geom_point(size = 2, alpha = 0.7) +
scale_color_manual(values = c("0" = "blue", "1" = "red"),
name = "Diagnosis",
labels = c("Negative", "Positive")) +
labs(title = "BMI vs Age Colored by Diagnosis",
x = "Age",
y = "BMI") +
theme_minimal()
# Assuming your data frame is called 'patients'
cor_matrix <- cor(patients[, sapply(patients, is.numeric)], use = "pairwise.complete.obs")
print(cor_matrix)
## Pregnancies Glucose BloodPressure SkinThickness Insulin
## Pregnancies 1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461
## Glucose 0.12945867 1.00000000 0.15258959 0.05732789 0.33135711
## BloodPressure 0.14128198 0.15258959 1.00000000 0.20737054 0.08893338
## SkinThickness -0.08167177 0.05732789 0.20737054 1.00000000 0.43678257
## Insulin -0.07353461 0.33135711 0.08893338 0.43678257 1.00000000
## BMI 0.01768309 0.22107107 0.28180529 0.39257320 0.19785906
## Pedigree -0.03352267 0.13733730 0.04126495 0.18392757 0.18507093
## Age 0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295
## Diagnosis 0.22189815 0.46658140 0.06506836 0.07475223 0.13054795
## BMI Pedigree Age Diagnosis
## Pregnancies 0.01768309 -0.03352267 0.54434123 0.22189815
## Glucose 0.22107107 0.13733730 0.26351432 0.46658140
## BloodPressure 0.28180529 0.04126495 0.23952795 0.06506836
## SkinThickness 0.39257320 0.18392757 -0.11397026 0.07475223
## Insulin 0.19785906 0.18507093 -0.04216295 0.13054795
## BMI 1.00000000 0.14064695 0.03624187 0.29269466
## Pedigree 0.14064695 1.00000000 0.03356131 0.17384407
## Age 0.03624187 0.03356131 1.00000000 0.23835598
## Diagnosis 0.29269466 0.17384407 0.23835598 1.00000000
# Starting from patients data frame:
patients_corr <- patients[, sapply(patients, is.numeric)]
# Now calculate correlation matrix
cor_mat <- cor(patients_corr, use = "pairwise.complete.obs")
corrplot(cor_mat, method = "color", type = "upper",
tl.col = "black", tl.srt = 45)
# Now use brewer.pal
cols <- colorRampPalette(brewer.pal(9, "YlGnBu"))(100)
# Then plot
corrplot::corrplot(cor_matrix, method = "color", type = "upper",
tl.cex = 0.8,
addCoef.col = "black",
col = cols)
# Leaving pregnancies zeros and Diagnosis zeros as these are meaningful
patients$Insulin[patients$Insulin == 0] <- NA
patients$BMI[patients$BMI == 0] <- NA
patients$SkinThickness[patients$SkinThickness == 0] <- NA
patients$BloodPressure[patients$BloodPressure == 0] <- NA
patients$Pedigree[patients$Pedigree == 0] <- NA
patients$Glucose[patients$Glucose == 0] <- NA
patients$Age[patients$Age == 0] <- NA
#Count NAs
sum(is.na(patients))
## [1] 652
#652 NAs total
sum(apply(patients, 1, function(row) any(is.na(row))))
## [1] 376
#376 rows have a missing value
sum(patients == 0, na.rm = TRUE)
## [1] 611
#611 zeros in total
sum(patients$Age == 0, na.rm = TRUE)
## [1] 0
#All Age zeros converted to NA
sum(patients$SkinThickness == 0, na.rm = TRUE)
## [1] 0
#All SkinTckness zeros converted to NA
sum(patients$Insulin == 0, na.rm = TRUE)
## [1] 0
#All Insulin zeros have been converted to NA
sum(patients$BMI == 0, na.rm = TRUE)
## [1] 0
#All BMI zeros have been converted to NA
sum(patients$Pedigree == 0, na.rm = TRUE)
## [1] 0
#All pedigree zeros have been converted to NA
sum(patients$Glucose == 0, na.rm = TRUE)
## [1] 0
#All glucose zeros have been converted to NA
sum(patients$BloodPressure == 0, na.rm = TRUE)
## [1] 0
#All blood pressure zeros have been converted to NA
sum(patients$Diagnosis == 0, na.rm = TRUE)
## [1] 500
sum(patients$Diagnosis == 1, na.rm = TRUE)
## [1] 268
Diagnosis and pregnancy zeros have been left as zeros (Binary- positive or negative diagnosis).
268 postive diagnosis in the dataset 500 negative diagnosis in the dataset
#correlation matrix
#Ignore NAs and visualize correlation between variables
cor_matrix2 <- cor(patients[, 1:9], use = "complete.obs")
print(cor_matrix2)
## Pregnancies Glucose BloodPressure SkinThickness Insulin
## Pregnancies 1.000000000 0.1982910 0.2133548 0.0932094 0.07898363
## Glucose 0.198291043 1.0000000 0.2100266 0.1988558 0.58122301
## BloodPressure 0.213354775 0.2100266 1.0000000 0.2325712 0.09851150
## SkinThickness 0.093209397 0.1988558 0.2325712 1.0000000 0.18219906
## Insulin 0.078983625 0.5812230 0.0985115 0.1821991 1.00000000
## BMI -0.025347276 0.2095159 0.3044034 0.6643549 0.22639652
## Pedigree 0.007562116 0.1401802 -0.0159711 0.1604985 0.13590578
## Age 0.679608470 0.3436415 0.3000389 0.1677611 0.21708199
## Diagnosis 0.256565956 0.5157027 0.1926733 0.2559357 0.30142922
## BMI Pedigree Age Diagnosis
## Pregnancies -0.02534728 0.007562116 0.67960847 0.2565660
## Glucose 0.20951592 0.140180180 0.34364150 0.5157027
## BloodPressure 0.30440337 -0.015971104 0.30003895 0.1926733
## SkinThickness 0.66435487 0.160498526 0.16776114 0.2559357
## Insulin 0.22639652 0.135905781 0.21708199 0.3014292
## BMI 1.00000000 0.158771043 0.06981380 0.2701184
## Pedigree 0.15877104 1.000000000 0.08502911 0.2093295
## Age 0.06981380 0.085029106 1.00000000 0.3508038
## Diagnosis 0.27011841 0.209329511 0.35080380 1.0000000
corrplot(cor_matrix2, method = "circle")
#Highly correlated features:
#Age & number of pregnancies- 0.68
#Glucose & Insulin- 0.58
#BMI & skin thickness- 0.66
#Diagnosis & glucose- 0.515
#VIF
#model <- lm(Diagnosis ~ BMI + Age + Pregnancies + Pedigree + Glucose + BloodPressure, Insulin, SkinThickness, data = patients)
model <- lm(Diagnosis ~ BMI + Age + Pregnancies + Pedigree + Glucose + BloodPressure + Insulin + SkinThickness, data = patients)
vif(model)
## BMI Age Pregnancies Pedigree Glucose
## 1.979596 2.129433 1.900719 1.059315 1.670072
## BloodPressure Insulin SkinThickness
## 1.231815 1.556143 1.852772
There are some correlated values, but VIF scores are all below 3, which means there is no multicolinearity.
# Show rows with any missing data
patients[!complete.cases(patients), ]
# Visualize missingness by variable
gg_miss_var(patients)
# Visualize missing data patterns
vis_miss(patients)
colSums(is.na(patients))
## Pregnancies Glucose BloodPressure SkinThickness Insulin
## 0 5 35 227 374
## BMI Pedigree Age Diagnosis
## 11 0 0 0
colSums(patients == 0, na.rm = TRUE)
## Pregnancies Glucose BloodPressure SkinThickness Insulin
## 111 0 0 0 0
## BMI Pedigree Age Diagnosis
## 0 0 0 500
#3 columns with <35 missing rows
#111 rows where pregnancies == 0
#500 rows where Diagnosis == 0
zero_or_na_per_row <- apply(patients, 1, function(row) {
sum(is.na(row))
})
(rows_3_or_more <- sum(zero_or_na_per_row >= 3))
## [1] 35
We see that there are 35 rows where blood pressure, skin thickness, and insulin are missing together. We choose to remove these rows at less than 4% of the data.
# Remove rows with 3 or more NAs
df_clean <- patients[rowSums(is.na(patients)) < 3, ]
gg_miss_upset(patients)
SkinThickness and Insulin missing together 192 times. Possible missingness explainations: more invasive testing, or these items are only done for certain clients or appointment types. This should be explored to see if data is MAR, MCAR, or MNAR.
length(patients$Insulin)
## [1] 768
length(patients$SkinThickness)
## [1] 768
patients$insulin_skin_missing <- ifelse(is.na(patients$Insulin) & is.na(patients$SkinThickness), 1, 0)
# Run t-test for BMI
t_result <- t.test(BMI ~ insulin_skin_missing, data = patients)
print(t_result)
##
## Welch Two Sample t-test
##
## data: BMI by insulin_skin_missing
## t = 2.7363, df = 397.34, p-value = 0.006493
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.4284462 2.6153131
## sample estimates:
## mean in group 0 mean in group 1
## 32.89573 31.37385
# Run t-test for Pregnancies
t_result <- t.test(Pregnancies ~ insulin_skin_missing, data = patients)
print(t_result)
##
## Welch Two Sample t-test
##
## data: Pregnancies by insulin_skin_missing
## t = -4.2626, df = 421.29, p-value = 2.496e-05
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -1.646331 -0.607173
## sample estimates:
## mean in group 0 mean in group 1
## 3.512015 4.638767
# Run t-test for Age
t_result <- t.test(Age ~ insulin_skin_missing, data = patients)
print(t_result)
##
## Welch Two Sample t-test
##
## data: Age by insulin_skin_missing
## t = -5.795, df = 360.35, p-value = 1.492e-08
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -7.624804 -3.760948
## sample estimates:
## mean in group 0 mean in group 1
## 31.55823 37.25110
# Run t-test for Pedigree
t_result <- t.test(Pedigree ~ insulin_skin_missing, data = patients)
print(t_result)
##
## Welch Two Sample t-test
##
## data: Pedigree by insulin_skin_missing
## t = 4.7151, df = 525.84, p-value = 3.099e-06
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.0650795 0.1580396
## sample estimates:
## mean in group 0 mean in group 1
## 0.5048503 0.3932907
# Run t-test for Glucose
t_result <- t.test(Glucose ~ insulin_skin_missing, data = patients)
print(t_result)
##
## Welch Two Sample t-test
##
## data: Glucose by insulin_skin_missing
## t = -1.0567, df = 443.69, p-value = 0.2912
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -7.175451 2.157369
## sample estimates:
## mean in group 0 mean in group 1
## 120.9403 123.4493
# Run t-test for BloodPressure
t_result <- t.test(BloodPressure ~ insulin_skin_missing, data = patients)
print(t_result)
##
## Welch Two Sample t-test
##
## data: BloodPressure by insulin_skin_missing
## t = -3.4606, df = 340.58, p-value = 0.0006075
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -5.589413 -1.538200
## sample estimates:
## mean in group 0 mean in group 1
## 71.46197 75.02577
ggplot(patients, aes(x = factor(insulin_skin_missing), y = Age)) +
geom_boxplot(fill = c("#a6cee3", "#1f78b4")) +
labs(x = "Insulin & SkinThickness Missing (0 = No, 1 = Yes)", y = "Age",
title = "Age Distribution by Missingness of Insulin and SkinThickness") +
theme_minimal()
ggplot(patients, aes(x = factor(insulin_skin_missing), y = Pregnancies)) +
geom_boxplot(fill = c("#fdbf6f", "#ff7f00")) +
labs(x = "Insulin & SkinThickness Missing (0 = No, 1 = Yes)", y = "Number of Pregnancies",
title = "Pregnancies Distribution by Missingness of Insulin and SkinThickness") +
theme_minimal()
ggplot(patients, aes(x = factor(insulin_skin_missing), y = Pedigree)) +
geom_boxplot(fill = c("blue", "forestgreen")) +
labs(x = "Insulin & SkinThickness Missing (0 = No, 1 = Yes)", y = "Pedigree",
title = "Pedigree Distribution by Missingness of Insulin and SkinThickness") +
theme_minimal()
Analysis: There is a statistically significantly difference in several features means and missing or not missing the set of variables. We are suspecting that high missing rates of skin thickness and insulin are missing at random. Upon researching the domain, we suspect that older patients, or patients who have been pregnant a number of times before may often skip these measurements. They are not routine, and are possibly opted for only when there are other predictive factors observed. This indicates the missingness is at random (MAR). Imputation methods should be carefully considered using this information.
# Subset rows where Pregnancies == 0
preg0_df <- subset(patients, Pregnancies == 0)
# Count how many of those have Diagnosis == 1 (GDM)
gdm_with_preg0 <- sum(preg0_df$Diagnosis == 1, na.rm = TRUE)
# Total number of rows with Pregnancies == 0
total_preg0 <- nrow(preg0_df)
# Print results
cat("Total rows with Pregnancies == 0:", total_preg0, "\n")
## Total rows with Pregnancies == 0: 111
cat("Rows with Pregnancies == 0 AND Diagnosis == 1 (GDM):", gdm_with_preg0, "\n")
## Rows with Pregnancies == 0 AND Diagnosis == 1 (GDM): 38
There are 38 cases where pregnancy ==0 and diagnosis ==1. We will assume ‘Pregnancies’ == 0 means previous pregnancies.
any(duplicated(df_clean))
## [1] FALSE
colSums(is.na(df_clean))
## Pregnancies Glucose BloodPressure SkinThickness Insulin
## 0 5 2 192 339
## BMI Pedigree Age Diagnosis
## 2 0 0 0
cols_to_clean <- names(which(colSums(is.na(df_clean)) < 6))
cols_to_clean
## [1] "Pregnancies" "Glucose" "BloodPressure" "BMI"
## [5] "Pedigree" "Age" "Diagnosis"
# Keep rows with no NA in those selected columns
df_clean <- df_clean[complete.cases(df_clean[, cols_to_clean]), ]
colSums(is.na(df_clean))
## Pregnancies Glucose BloodPressure SkinThickness Insulin
## 0 0 0 192 332
## BMI Pedigree Age Diagnosis
## 0 0 0 0
The only columns with NAs remaining are SkinThickness and Insulin
#Total number of rows
(total_rows <- nrow(df_clean))
## [1] 724
#729 rows remaining after basic cleaning
# Percentage of missing SkinThickness
skin_missing_pct <- sum(is.na(df_clean$SkinThickness)) / total_rows * 100
# Percentage of missing Insulin
insulin_missing_pct <- sum(is.na(df_clean$Insulin)) / total_rows * 100
# Display the results
cat("Percentage of missing SkinThickness values:", round(skin_missing_pct, 2), "%\n")
## Percentage of missing SkinThickness values: 26.52 %
cat("Percentage of missing Insulin values:", round(insulin_missing_pct, 2), "%\n")
## Percentage of missing Insulin values: 45.86 %
27% of Skin thickness values are missing. 46% of Insulin values are missing.
vars_to_plot <- c("Insulin", "BMI", "SkinThickness", "BloodPressure", "Pedigree", "Glucose", "Age")
par(mfrow = c(4, 2), mar = c(4, 4, 2, 1))
for (var_name in vars_to_plot) {
boxplot(df_clean[[var_name]],
main = paste(var_name, "Boxplot"),
horizontal = TRUE,
col = "lightblue",
na.action = na.omit)
}
# BloodPressure issues (0 or < 40)
bp_issues <- df_clean[ df_clean$BloodPressure < 40, ]
cat("BloodPressure < 40:", sum(df_clean$BloodPressure < 40, na.rm = TRUE), "\n")
## BloodPressure < 40: 4
cat("Total BloodPressure issues:", nrow(bp_issues), "\n\n")
## Total BloodPressure issues: 4
To remove 4 instances of blood pressure errors
# SkinThickness issues (0 values)
skin_issues <- df_clean[df_clean$SkinThickness < 5 | df_clean$SkinThickness > 60, ]
cat("SkinThickness < 5:", sum(df_clean$SkinThickness < 5, na.rm = TRUE), "\n")
## SkinThickness < 5: 0
cat("SkinThickness > 60:", sum(df_clean$SkinThickness > 60, na.rm = TRUE), "\n")
## SkinThickness > 60: 2
To remove 2 Instances of skin thickness errors
# Glucose issues (< 50)
glucose_issues <- df_clean[df_clean$Glucose < 50, ]
cat("Glucose < 50:", nrow(glucose_issues), "\n\n")
## Glucose < 50: 1
To remove 6 instances of glucose errors
#Age
print(which(df_clean$Age > 50))
## [1] 8 11 12 13 22 26 28 37 41 50 63 86 108 116 122 133 140 176 193
## [20] 195 196 201 210 211 224 247 250 259 263 269 270 279 284 303 325 339 340 341
## [39] 352 365 378 429 431 432 447 451 458 460 465 468 478 481 485 486 488 503 508
## [58] 512 518 523 526 545 548 550 554 567 619 621 627 635 637 661 674 676 691 714
## [77] 716 720
print(which(df_clean$Age < 11))
## integer(0)
#Inspect rows
print(as.data.frame(df_clean[c(8, 11, 12, 13, 21, 25, 27, 36, 40, 49, 61, 84, 106, 114, 119, 130, 137, 173,
190, 192, 193, 198, 207, 208, 221, 244, 247, 256, 260, 266, 267, 276, 281, 300, 322, 336,
337, 338, 349, 362, 375, 426, 428, 429, 444, 448, 455, 457, 462, 465, 475, 478, 482, 483,
485, 500, 505, 509, 515, 520, 523, 544, 546, 550, 561, 613, 615, 621, 629, 631, 655, 668,
670, 685, 708, 710, 714), ]))
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI Pedigree Age
## 9 2 197 70 45 543 30.5 0.158 53
## 13 10 139 80 NA NA 27.1 1.441 57
## 14 1 189 60 23 846 30.1 0.398 59
## 15 5 166 72 19 175 25.8 0.587 51
## 24 9 119 80 35 NA 29.0 0.263 29
## 28 1 97 66 15 140 23.2 0.487 22
## 30 5 117 92 NA NA 34.1 0.337 38
## 39 2 90 68 42 NA 38.2 0.503 27
## 43 7 106 92 18 NA 22.7 0.235 48
## 53 5 88 66 21 23 24.4 0.342 30
## 66 5 99 74 27 NA 29.0 0.203 32
## 92 4 123 80 15 176 32.0 0.443 34
## 114 4 76 62 NA NA 34.0 0.391 25
## 122 6 111 64 39 NA 34.2 0.260 24
## 127 3 120 70 30 135 42.9 0.452 30
## 138 0 93 60 25 92 28.7 0.532 22
## 145 4 154 62 31 284 32.8 0.237 23
## 184 5 73 60 NA NA 26.8 0.268 27
## 202 1 138 82 NA NA 40.1 0.236 28
## 204 2 99 70 16 44 20.4 0.235 27
## 205 6 103 72 32 190 37.7 0.324 55
## 210 7 184 84 33 NA 35.5 0.355 41
## 219 5 85 74 22 NA 29.0 1.224 32
## 220 5 112 66 NA NA 37.8 0.261 41
## 234 4 122 68 NA NA 35.0 0.394 29
## 257 3 111 56 39 NA 30.1 0.557 30
## 260 11 155 76 28 150 33.3 1.353 51
## 272 2 108 62 32 56 25.2 0.128 21
## 276 2 100 70 52 57 40.5 0.677 25
## 282 10 129 76 28 122 35.9 0.280 39
## 283 7 133 88 15 155 32.4 0.262 37
## 292 0 107 62 30 74 36.6 0.757 25
## 297 2 146 70 38 360 28.0 0.337 29
## 317 3 99 80 11 64 19.3 0.284 30
## 341 1 130 70 13 105 25.9 0.472 22
## 359 12 88 74 40 54 35.3 0.378 48
## 360 1 196 76 36 249 36.5 0.875 29
## 361 5 189 64 33 325 31.2 0.583 29
## 373 0 84 64 22 66 35.8 0.545 21
## 386 1 119 54 13 50 22.3 0.205 24
## 399 3 82 70 NA NA 21.1 0.389 25
## 453 0 91 68 32 210 39.9 0.381 25
## 456 14 175 62 30 NA 33.6 0.212 38
## 457 1 135 54 NA NA 26.7 0.687 62
## 473 0 119 66 27 NA 38.8 0.259 22
## 477 2 105 80 45 191 33.7 0.711 29
## 484 0 84 82 31 125 38.2 0.233 23
## 487 1 139 62 41 480 40.7 0.536 21
## 492 2 89 90 30 NA 33.5 0.292 42
## 496 6 166 74 NA NA 26.6 0.304 66
## 507 0 180 90 26 90 36.5 0.314 35
## 510 8 120 78 NA NA 25.0 0.409 64
## 514 2 91 62 NA NA 27.3 0.525 22
## 515 3 99 54 19 86 25.6 0.154 24
## 517 9 145 88 34 165 30.3 0.771 53
## 533 1 86 66 52 65 41.3 0.917 29
## 540 3 129 92 49 155 36.4 0.968 32
## 544 4 84 90 23 56 39.5 0.159 25
## 550 4 189 110 31 NA 28.5 0.680 37
## 555 1 84 64 23 115 36.9 0.471 28
## 558 8 110 76 NA NA 27.8 0.237 58
## 579 10 133 68 NA NA 27.0 0.245 36
## 581 0 151 90 46 NA 42.1 0.371 21
## 585 8 124 76 24 600 28.7 0.687 52
## 597 0 67 76 NA NA 45.3 0.194 46
## 653 5 123 74 40 77 34.1 0.269 28
## 655 1 106 70 28 135 34.2 0.142 22
## 661 10 162 84 NA NA 27.7 0.182 54
## 669 6 98 58 33 190 34.0 0.430 43
## 671 6 165 68 26 168 33.6 0.631 49
## 696 7 142 90 24 480 30.4 0.128 43
## 712 5 126 78 27 22 29.6 0.439 40
## 714 0 134 58 20 291 26.4 0.352 21
## 729 2 175 88 NA NA 22.9 0.326 22
## 752 1 121 78 39 74 39.0 0.261 28
## 754 0 181 88 44 510 43.3 0.222 26
## 758 0 123 72 NA NA 36.3 0.258 52
## Diagnosis
## 9 1
## 13 0
## 14 1
## 15 1
## 24 1
## 28 0
## 30 0
## 39 1
## 43 0
## 53 0
## 66 0
## 92 0
## 114 0
## 122 0
## 127 0
## 138 0
## 145 0
## 184 0
## 202 0
## 204 0
## 205 0
## 210 1
## 219 1
## 220 1
## 234 0
## 257 0
## 260 1
## 272 0
## 276 0
## 282 0
## 283 0
## 292 1
## 297 1
## 317 0
## 341 0
## 359 0
## 360 1
## 361 1
## 373 0
## 386 0
## 399 0
## 453 0
## 456 1
## 457 0
## 473 0
## 477 1
## 484 0
## 487 0
## 492 0
## 496 0
## 507 1
## 510 0
## 514 0
## 515 0
## 517 1
## 533 0
## 540 1
## 544 0
## 550 0
## 555 0
## 558 0
## 579 0
## 581 1
## 585 1
## 597 0
## 653 0
## 655 0
## 661 0
## 669 0
## 671 0
## 696 1
## 712 0
## 714 0
## 729 0
## 752 0
## 754 1
## 758 1
# Specify the row indices to remove
rows_to_remove <- c(8, 11, 12, 13, 21, 25, 27, 36, 40, 49, 61, 84, 106, 114, 119, 130, 137, 173,
190, 192, 193, 198, 207, 208, 221, 244, 247, 256, 260, 266, 267, 276, 281, 300,
322, 336, 337, 338, 349, 362, 375, 426, 428, 429, 444, 448, 455, 457, 462, 465,
475, 478, 482, 483, 485, 500, 505, 509, 515, 520, 523, 544, 546, 550, 561, 613,
615, 621, 629, 631, 655, 668, 670, 685, 708, 710, 714)
# Create new cleaned dataset by removing those rows
df_clean_2 <- df_clean[-rows_to_remove, ]
print(df_clean[c(419, 545, 16, 58, 118, 562, 564), ])
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI Pedigree Age
## 446 0 180 78 63 14 59.4 2.420 25
## 580 2 197 70 99 NA 34.7 0.575 62
## 19 1 103 30 38 83 43.3 0.183 33
## 63 5 44 62 NA NA 25.0 0.587 36
## 126 1 88 30 42 99 55.0 0.496 26
## 598 1 89 24 19 25 27.8 0.559 21
## 600 1 109 38 18 120 23.1 0.407 26
## Diagnosis
## 446 1
## 580 1
## 19 0
## 63 0
## 126 1
## 598 0
## 600 0
#row 419 looks normal comparing to bmi- keep
# Remove rows 545, 16, 58, 118, 562, 564
rows_to_remove <- c(545, 16, 58, 118, 562, 564)
# Create new dataframe without these rows
df_clean_2 <- df_clean_2[-rows_to_remove, ]
df_clean_2 <- df_clean %>%
filter(BloodPressure >= 40) %>%
filter(SkinThickness >= 5 & SkinThickness <= 60) %>%
filter(Glucose >= 50) %>%
filter(Age >= 11 & Age <= 50)
df_clean_2 <- df_clean %>%
dplyr::filter(Age >= 11 & Age <= 50,
Glucose >= 50,
SkinThickness >= 5 & SkinThickness <= 60,
BloodPressure >= 40)
summary(df_clean_2)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 56.0 Min. : 40.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 97.0 1st Qu.: 64.00 1st Qu.:21.25
## Median : 2.000 Median :112.5 Median : 70.00 Median :29.00
## Mean : 3.294 Mean :118.9 Mean : 71.12 Mean :28.97
## 3rd Qu.: 5.000 3rd Qu.:137.0 3rd Qu.: 78.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :110.00 Max. :60.00
##
## Insulin BMI Pedigree Age
## Min. : 15.0 Min. :18.20 Min. :0.0850 Min. :21.0
## 1st Qu.: 75.5 1st Qu.:27.82 1st Qu.:0.2580 1st Qu.:23.0
## Median :120.0 Median :32.80 Median :0.4055 Median :27.0
## Mean :149.2 Mean :32.82 Mean :0.4938 Mean :29.5
## 3rd Qu.:182.0 3rd Qu.:36.88 3rd Qu.:0.6535 3rd Qu.:35.0
## Max. :744.0 Max. :67.10 Max. :2.3290 Max. :50.0
## NA's :127
## Diagnosis
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3086
## 3rd Qu.:1.0000
## Max. :1.0000
##
str(df_clean_2)
## 'data.frame': 486 obs. of 9 variables:
## $ Pregnancies : int 6 1 1 0 3 0 1 3 9 10 ...
## $ Glucose : int 148 85 89 137 78 118 115 126 119 125 ...
## $ BloodPressure: int 72 66 66 40 50 84 70 88 80 70 ...
## $ SkinThickness: int 35 29 23 35 32 47 30 41 35 26 ...
## $ Insulin : int NA NA 94 168 88 230 96 235 NA 115 ...
## $ BMI : num 33.6 26.6 28.1 43.1 31 45.8 34.6 39.3 29 31.1 ...
## $ Pedigree : num 0.627 0.351 0.167 2.288 0.248 ...
## $ Age : int 50 31 21 33 26 31 32 27 29 41 ...
## $ Diagnosis : int 1 0 0 1 1 1 1 0 1 1 ...
data_to_be_imputed = df_clean_2
imputed_data <- mice(data_to_be_imputed, method = "pmm", m = 5, seed = 123)
##
## iter imp variable
## 1 1 Insulin
## 1 2 Insulin
## 1 3 Insulin
## 1 4 Insulin
## 1 5 Insulin
## 2 1 Insulin
## 2 2 Insulin
## 2 3 Insulin
## 2 4 Insulin
## 2 5 Insulin
## 3 1 Insulin
## 3 2 Insulin
## 3 3 Insulin
## 3 4 Insulin
## 3 5 Insulin
## 4 1 Insulin
## 4 2 Insulin
## 4 3 Insulin
## 4 4 Insulin
## 4 5 Insulin
## 5 1 Insulin
## 5 2 Insulin
## 5 3 Insulin
## 5 4 Insulin
## 5 5 Insulin
imputed_df <- complete(imputed_data) # gets the first completed dataset
# Check NAs
colSums(is.na(imputed_df))
## Pregnancies Glucose BloodPressure SkinThickness Insulin
## 0 0 0 0 0
## BMI Pedigree Age Diagnosis
## 0 0 0 0
summary(imputed_df)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 56.0 Min. : 40.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 97.0 1st Qu.: 64.00 1st Qu.:21.25
## Median : 2.000 Median :112.5 Median : 70.00 Median :29.00
## Mean : 3.294 Mean :118.9 Mean : 71.12 Mean :28.97
## 3rd Qu.: 5.000 3rd Qu.:137.0 3rd Qu.: 78.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :110.00 Max. :60.00
## Insulin BMI Pedigree Age
## Min. : 15.0 Min. :18.20 Min. :0.0850 Min. :21.0
## 1st Qu.: 75.0 1st Qu.:27.82 1st Qu.:0.2580 1st Qu.:23.0
## Median :115.0 Median :32.80 Median :0.4055 Median :27.0
## Mean :146.2 Mean :32.82 Mean :0.4938 Mean :29.5
## 3rd Qu.:180.0 3rd Qu.:36.88 3rd Qu.:0.6535 3rd Qu.:35.0
## Max. :744.0 Max. :67.10 Max. :2.3290 Max. :50.0
## Diagnosis
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3086
## 3rd Qu.:1.0000
## Max. :1.0000
head(imputed_df)
#Final imputed dataset
rf_dataset_imputed <- imputed_df
write.csv(rf_dataset_imputed, "/Users/arnenyecknyeck/Desktop/Statistical-Inference-Package/completed_data_mice.csv", row.names = FALSE)
set.seed(123) # For reproducibility
# Save outcome variable (Diagnosis) separately
outcome_var <- data_to_be_imputed$Diagnosis
# Remove outcome before imputation
df_features_only <- data_to_be_imputed[, !(names(data_to_be_imputed) %in% "Diagnosis")]
# Check Insulin and SkinThickness presence before imputation
stopifnot("Insulin" %in% colnames(df_features_only))
stopifnot("SkinThickness" %in% colnames(df_features_only))
# Perform RF imputation on features only
rf_imputed <- missForest(df_features_only, maxiter = 10, ntree = 100)
# Extract completed data
rf_dataset_imputed <- rf_imputed$ximp
# Add outcome variable back
rf_dataset_imputed$Diagnosis <- outcome_var
# Check imputation results
colSums(is.na(rf_dataset_imputed))
## Pregnancies Glucose BloodPressure SkinThickness Insulin
## 0 0 0 0 0
## BMI Pedigree Age Diagnosis
## 0 0 0 0
summary(rf_dataset_imputed)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 56.0 Min. : 40.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 97.0 1st Qu.: 64.00 1st Qu.:21.25
## Median : 2.000 Median :112.5 Median : 70.00 Median :29.00
## Mean : 3.294 Mean :118.9 Mean : 71.12 Mean :28.97
## 3rd Qu.: 5.000 3rd Qu.:137.0 3rd Qu.: 78.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :110.00 Max. :60.00
## Insulin BMI Pedigree Age
## Min. : 15.00 Min. :18.20 Min. :0.0850 Min. :21.0
## 1st Qu.: 84.66 1st Qu.:27.82 1st Qu.:0.2580 1st Qu.:23.0
## Median :120.00 Median :32.80 Median :0.4055 Median :27.0
## Mean :147.05 Mean :32.82 Mean :0.4938 Mean :29.5
## 3rd Qu.:180.00 3rd Qu.:36.88 3rd Qu.:0.6535 3rd Qu.:35.0
## Max. :744.00 Max. :67.10 Max. :2.3290 Max. :50.0
## Diagnosis
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3086
## 3rd Qu.:1.0000
## Max. :1.0000
head(rf_dataset_imputed)
write.csv(rf_dataset_imputed, "/Users/arnenyecknyeck/Desktop/Statistical-Inference-Package/completed_data_rf.csv", row.names = FALSE)